Setup

library(tidyverse)
library(magrittr)
library(parallel)
library(pander)
library(here)
library(scales)
library(ggpubr)
library(kableExtra)
library(edgeR)
library(DT)
library(ggrepel)
library(pheatmap)
library(ggdendro)
if (interactive()) setwd(here::here())
theme_set(theme_bw())
cores <- detectCores() - 1
load(
  here::here("1_Psen2S4Ter/R/output/1_1_DE.RData")
)

k-mer analysis

jellyfish v2.3.0 was used to count kmers of trimmed fastq files that had been filtered for rRNA sequences. This was performed for 5 values: \(k = 5, 6, 7, 8, 9, 10\). Lower values of \(k\) lose specificity in comparison to higher values, however as \(k\) increases, the exponential increase of possible kmers causes limitations due to computational processing time.

k = 5

Counts

k5files <- list.files("/hpcfs/users/a1647910/20200310_rRNADepletion/1_Psen2S4Ter/6_jellyfish2pass/k5", pattern = "_dumps.txt", full.names = TRUE)
k5counts <- lapply(k5files, function(x){
  read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
    set_colnames(str_remove_all(colnames(.), "_6month_F3|[0-9]*_Ps2Ex3M1_|_dumps\\.txt"))
}) %>%
  purrr::reduce(full_join) %>%
  dplyr::select(mer, contains(c("WT", "Heter")))
k5dge <- k5counts %>%
  as.data.frame() %>%
  column_to_rownames("mer") %>%
  DGEList() %>%
  calcNormFactors()
k5dge$samples %<>%
  rownames_to_column("rowname") %>%
  mutate(sample = rowname) %>%
  left_join(addInfo) %>%
  column_to_rownames("rowname")
k5dge$samples$group <- colnames(k5dge) %>%
  str_extract("(WT|Heter)") %>%
  factor(levels = c("WT", "Heter"))

Properties

k5dist <- k5dge %>%
  cpm(log = TRUE) %>%
  as.data.frame() %>%
  pivot_longer(everything(), names_to = "sample", values_to = "count") %>%
  ggplot(aes(x=count, colour = sample)) +
  geom_density() +
  labs(x = "intensity", title = "Distribution of 5-mers")
k5labels <- k5dge$samples %>% 
  mutate(label = paste0(sample, "\n", percent(rRNA, accuracy = 0.01), " rRNA")) %>% 
  .$label
k5heat <- k5dge %>% 
  cpm(log = TRUE) %>%
  as.data.frame() %>%
  t() %>%
  pheatmap(silent = TRUE, cluster_cols = FALSE,
           show_colnames = FALSE, fontsize = 9,
           fontsize_row = 10, border_color = NA,
           main = "5-mer counts heatmap", labels_row = k5labels)
k5heat$tree_row$labels <- k5labels
k5den <- ggdendrogram(k5heat$tree_row, rotate = TRUE) +
  labs(title = "Hierarchical clustering of 5-mer counts") +
  theme(plot.title = element_text(size = 12))
# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
k5pca <- k5dge %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(k5pca)$importance %>% pander(split.tables = Inf)
k5pcaPlot <- k5pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k5dge$samples) %>%
  ggplot(aes(PC1, PC2, colour = group, label = rRNA)) +
  geom_point(alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  labs(
    x = paste0("PC1 (", percent(summary(k5pca)$importance[2, "PC1"]), ")"),
    y = paste0("PC2 (", percent(summary(k5pca)$importance[2, "PC2"]), ")"),
    colour = "Genotype",
    title = "k = 5"
  ) +
  scale_colour_discrete(labels = c("Wildtype", "Mutant"))
k5pcaRrna <- k5pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k5dge$samples) %>%
  ggplot(aes(PC1, rRNA, label = rRNA)) +
  geom_point(aes(colour = group), alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  geom_smooth(method = "lm") +
  labs(
    x = paste0("PC1 (", percent(summary(k5pca)$importance[2, "PC1"]), ")"),
    y = "rRNA proportion",
    colour = "Genotype",
    title = "k = 5"
  ) +
  scale_y_continuous(labels = percent) +
  scale_colour_discrete(labels = c("Wildtype", "Mutant"))

Differential expression

k5design <- model.matrix(~rRNA, data = k5dge$samples)
k5voom <- voom(k5dge, design = k5design)
k5fit <- lmFit(k5voom, design = k5design)
k5eBayes <- eBayes(k5fit)
k5topTable <- k5eBayes %>%
  topTable(coef = colnames(k5design)[2], sort.by = "p", n = Inf) %>%
  set_colnames(str_remove(colnames(.), "ID\\.")) %>%
  rownames_to_column("mer") %>%
  mutate(BY = p.adjust(P.Value, "BY")) %>%
  mutate(DE = BY < 0.05) %>%
  dplyr::select(
    mer,
    AveExpr,
    logFC,
    P.Value,
    FDR = adj.P.Val,
    BY,
    t,
    DE,
    everything(),
    -B
  ) %>%
  as_tibble()

k = 6

Counts

k6files <- list.files("/hpcfs/users/a1647910/20200310_rRNADepletion/1_Psen2S4Ter/6_jellyfish2pass/k6", pattern = "_dumps.txt", full.names = TRUE)
k6counts <- lapply(k6files, function(x){
  read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
    set_colnames(str_remove_all(colnames(.), "_6month_F3|[0-9]*_Ps2Ex3M1_|_dumps\\.txt"))
}) %>%
  purrr::reduce(full_join) %>%
  dplyr::select(mer, contains(c("WT", "Heter")))
k6dge <- k6counts %>%
  as.data.frame() %>%
  column_to_rownames("mer") %>%
  DGEList() %>%
  calcNormFactors()
k6dge$samples %<>%
  rownames_to_column("rowname") %>%
  mutate(sample = rowname) %>%
  left_join(addInfo) %>%
  column_to_rownames("rowname")
k6dge$samples$group <- colnames(k6dge) %>%
  str_extract("(WT|Heter)") %>%
  factor(levels = c("WT", "Heter"))

Properties

k6dist <- k6dge %>%
  cpm(log = TRUE) %>%
  as.data.frame() %>%
  pivot_longer(everything(), names_to = "sample", values_to = "count") %>%
  ggplot(aes(x=count, colour = sample)) +
  geom_density() +
  labs(x = "intensity", title = "Distribution of 6-mers")
k6labels <- k6dge$samples %>% 
  mutate(label = paste0(sample, "\n", percent(rRNA, accuracy = 0.01), " rRNA")) %>% 
  .$label
k6heat <- k6dge %>% 
  cpm(log = TRUE) %>%
  as.data.frame() %>%
  t() %>%
  pheatmap(silent = TRUE, cluster_cols = FALSE,
           show_colnames = FALSE, fontsize = 9,
           fontsize_row = 10, border_color = NA,
           main = "6-mer counts heatmap", labels_row = k6labels)
k6heat$tree_row$labels <- k6labels
k6den <- ggdendrogram(k6heat$tree_row, rotate = TRUE) +
  labs(title = "Hierarchical clustering of 6-mer counts") +
  theme(plot.title = element_text(size = 12))
# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
k6pca <- k6dge %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(k6pca)$importance %>% pander(split.tables = Inf)
# Plot PCA
k6pcaPlot <- k6pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k6dge$samples) %>%
  ggplot(aes(PC1, PC2, colour = group, label = rRNA)) +
  geom_point(alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  labs(
    x = paste0("PC1 (", percent(summary(k6pca)$importance[2, "PC1"]), ")"),
    y = paste0("PC2 (", percent(summary(k6pca)$importance[2, "PC2"]), ")"),
    colour = "Genotype",
    title = "k = 6"
  ) +
  scale_colour_discrete(labels = c("Wildtype", "Mutant"))
k6pcaRrna <- k6pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k6dge$samples) %>%
  ggplot(aes(PC1, rRNA, label = rRNA)) +
  geom_point(aes(colour = group), alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  geom_smooth(method = "lm") +
  labs(
    x = paste0("PC1 (", percent(summary(k6pca)$importance[2, "PC1"]), ")"),
    y = "rRNA proportion",
    colour = "Genotype",
    title = "k = 6"
  ) +
  scale_y_continuous(labels = percent) +
  scale_colour_discrete(labels = c("Wildtype", "Mutant"))

Differential expression

k6design <- model.matrix(~rRNA, data = k6dge$samples)
k6voom <- voom(k6dge, design = k6design)
k6fit <- lmFit(k6voom, design = k6design)
k6eBayes <- eBayes(k6fit)
k6topTable <- k6eBayes %>%
  topTable(coef = colnames(k6design)[2], sort.by = "p", n = Inf) %>%
  set_colnames(str_remove(colnames(.), "ID\\.")) %>%
  rownames_to_column("mer") %>%
  mutate(BY = p.adjust(P.Value, "BY")) %>%
  mutate(DE = BY < 0.05) %>%
  dplyr::select(
    mer,
    AveExpr,
    logFC,
    P.Value,
    FDR = adj.P.Val,
    BY,
    t,
    DE,
    everything(),
    -B
  ) %>%
  as_tibble()

k = 7

Counts

k7files <- list.files("/hpcfs/users/a1647910/20200310_rRNADepletion/1_Psen2S4Ter/6_jellyfish2pass/k7", pattern = "_dumps.txt", full.names = TRUE)
k7counts <- lapply(k7files, function(x){
  read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
    set_colnames(str_remove_all(colnames(.), "_6month_F3|[0-9]*_Ps2Ex3M1_|_dumps\\.txt"))
}) %>%
  purrr::reduce(full_join) %>%
  dplyr::select(mer, contains(c("WT", "Heter")))
k7dge <- k7counts %>%
  as.data.frame() %>%
  column_to_rownames("mer") %>%
  DGEList() %>%
  calcNormFactors()
k7dge$samples %<>%
  rownames_to_column("rowname") %>%
  mutate(sample = rowname) %>%
  left_join(addInfo) %>%
  column_to_rownames("rowname")
k7dge$samples$group <- colnames(k7dge) %>%
  str_extract("(WT|Heter)") %>%
  factor(levels = c("WT", "Heter"))

Properties

k7dist <- k7dge %>%
  cpm(log = TRUE) %>%
  as.data.frame() %>%
  pivot_longer(everything(), names_to = "sample", values_to = "count") %>%
  ggplot(aes(x=count, colour = sample)) +
  geom_density() +
  labs(x = "intensity", title = "Distribution of 7-mers")
k7labels <- k7dge$samples %>% 
  mutate(label = paste0(sample, "\n", percent(rRNA, accuracy = 0.01), " rRNA")) %>% 
  .$label
k7heat <- k7dge %>% 
  cpm(log = TRUE) %>%
  as.data.frame() %>%
  t() %>%
  pheatmap(silent = TRUE, cluster_cols = FALSE,
           show_colnames = FALSE, fontsize = 9,
           fontsize_row = 10, border_color = NA,
           main = "7-mer counts heatmap", labels_row = k7labels)
k7heat$tree_row$labels <- k7labels
k7den <- ggdendrogram(k7heat$tree_row, rotate = TRUE) +
  labs(title = "Hierarchical clustering of 7-mer counts") +
  theme(plot.title = element_text(size = 12))
# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
k7pca <- k7dge %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(k7pca)$importance %>% pander(split.tables = Inf)
# Plot PCA
k7pcaPlot <- k7pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k7dge$samples) %>%
  ggplot(aes(PC1, PC2, colour = group, label = rRNA)) +
  geom_point(alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  labs(
    x = paste0("PC1 (", percent(summary(k7pca)$importance[2, "PC1"]), ")"),
    y = paste0("PC2 (", percent(summary(k7pca)$importance[2, "PC2"]), ")"),
    colour = "Genotype",
    title = "k = 7"
  ) +
  scale_colour_discrete(labels = c("Wildtype", "Mutant"))
k7pcaRrna <- k7pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k7dge$samples) %>%
  ggplot(aes(PC1, rRNA, label = rRNA)) +
  geom_point(aes(colour = group), alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  geom_smooth(method = "lm") +
  labs(
    x = paste0("PC1 (", percent(summary(k7pca)$importance[2, "PC1"]), ")"),
    y = "rRNA proportion",
    colour = "Genotype",
    title = "k = 7"
  ) +
  scale_y_continuous(labels = percent) +
  scale_colour_discrete(labels = c("Wildtype", "Mutant"))

Differential expression

k7design <- model.matrix(~rRNA, data = k7dge$samples)
k7voom <- voom(k7dge, design = k7design)
k7fit <- lmFit(k7voom, design = k7design)
k7eBayes <- eBayes(k7fit)
k7topTable <- k7eBayes %>%
  topTable(coef = colnames(k7design)[2], sort.by = "p", n = Inf) %>%
  set_colnames(str_remove(colnames(.), "ID\\.")) %>%
  rownames_to_column("mer") %>%
  mutate(BY = p.adjust(P.Value, "BY")) %>%
  mutate(DE = BY < 0.05) %>%
  dplyr::select(
    mer,
    AveExpr,
    logFC,
    P.Value,
    FDR = adj.P.Val,
    BY,
    t,
    DE,
    everything(),
    -B
  ) %>%
  as_tibble()

k = 8

Counts

k8files <- list.files("/hpcfs/users/a1647910/20200310_rRNADepletion/1_Psen2S4Ter/6_jellyfish2pass/k8", pattern = "_dumps.txt", full.names = TRUE)
k8counts <- lapply(k8files, function(x){
  read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
    set_colnames(str_remove_all(colnames(.), "_6month_F3|[0-9]*_Ps2Ex3M1_|_dumps\\.txt"))
}) %>%
  purrr::reduce(full_join) %>%
  dplyr::select(mer, contains(c("WT", "Heter")))
k8dge <- k8counts %>%
  as.data.frame() %>%
  column_to_rownames("mer") %>%
  DGEList() %>%
  calcNormFactors()
k8dge$samples %<>%
  rownames_to_column("rowname") %>%
  mutate(sample = rowname) %>%
  left_join(addInfo) %>%
  column_to_rownames("rowname")
k8dge$samples$group <- colnames(k8dge) %>%
  str_extract("(WT|Heter)") %>%
  factor(levels = c("WT", "Heter"))

Properties

k8dist <- k8dge %>%
  cpm(log = TRUE) %>%
  as.data.frame() %>%
  pivot_longer(everything(), names_to = "sample", values_to = "count") %>%
  ggplot(aes(x=count, colour = sample)) +
  geom_density() +
  labs(x = "intensity", title = "Distribution of 8-mers")
k8labels <- k8dge$samples %>% 
  mutate(label = paste0(sample, "\n", percent(rRNA, accuracy = 0.01), " rRNA")) %>% 
  .$label
k8heat <- k8dge %>% 
  cpm(log = TRUE) %>%
  as.data.frame() %>%
  t() %>%
  pheatmap(silent = TRUE, cluster_cols = FALSE,
           show_colnames = FALSE, fontsize = 9,
           fontsize_row = 10, border_color = NA,
           main = "8-mer counts heatmap", labels_row = k8labels)
k8heat$tree_row$labels <- k8labels
k8den <- ggdendrogram(k8heat$tree_row, rotate = TRUE) +
  labs(title = "Hierarchical clustering of 8-mer counts") +
  theme(plot.title = element_text(size = 12))
# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
k8pca <- k8dge %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(k8pca)$importance %>% pander(split.tables = Inf)
# Plot PCA
k8pcaPlot <- k8pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k8dge$samples) %>%
  ggplot(aes(PC1, PC2, colour = group, label = rRNA)) +
  geom_point(alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  labs(
    x = paste0("PC1 (", percent(summary(k8pca)$importance[2, "PC1"]), ")"),
    y = paste0("PC2 (", percent(summary(k8pca)$importance[2, "PC2"]), ")"),
    colour = "Genotype",
    title = "k = 8"
  ) +
  scale_colour_discrete(labels = c("Wildtype", "Mutant"))
k8pcaRrna <- k8pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k8dge$samples) %>%
  ggplot(aes(PC1, rRNA, label = rRNA)) +
  geom_point(aes(colour = group), alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  geom_smooth(method = "lm") +
  labs(
    x = paste0("PC1 (", percent(summary(k8pca)$importance[2, "PC1"]), ")"),
    y = "rRNA proportion",
    colour = "Genotype",
    title = "k = 8"
  ) +
  scale_y_continuous(labels = percent) +
  scale_colour_discrete(labels = c("Wildtype", "Mutant"))

Differential expression

k8design <- model.matrix(~rRNA, data = k8dge$samples)
k8voom <- voom(k8dge, design = k8design)
k8fit <- lmFit(k8voom, design = k8design)
k8eBayes <- eBayes(k8fit)
k8topTable <- k8eBayes %>%
  topTable(coef = colnames(k8design)[2], sort.by = "p", n = Inf) %>%
  set_colnames(str_remove(colnames(.), "ID\\.")) %>%
  rownames_to_column("mer") %>%
  mutate(BY = p.adjust(P.Value, "BY")) %>%
  mutate(DE = BY < 0.05) %>%
  dplyr::select(
    mer,
    AveExpr,
    logFC,
    P.Value,
    FDR = adj.P.Val,
    BY,
    t,
    DE,
    everything(),
    -B
  ) %>%
  as_tibble()

k = 9

Counts

k9files <- list.files("/hpcfs/users/a1647910/20200310_rRNADepletion/1_Psen2S4Ter/6_jellyfish2pass/k9", pattern = "_dumps.txt", full.names = TRUE)
k9counts <- lapply(k9files, function(x){
  read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
    set_colnames(str_remove_all(colnames(.), "_6month_F3|[0-9]*_Ps2Ex3M1_|_dumps\\.txt"))
}) %>%
  purrr::reduce(full_join) %>%
  dplyr::select(mer, contains(c("WT", "Heter")))
k9dge <- k9counts %>%
  as.data.frame() %>%
  column_to_rownames("mer") %>%
  DGEList() %>%
  calcNormFactors()
k9dge$samples %<>%
  rownames_to_column("rowname") %>%
  mutate(sample = rowname) %>%
  left_join(addInfo) %>%
  column_to_rownames("rowname")
k9dge$samples$group <- colnames(k9dge) %>%
  str_extract("(WT|Heter)") %>%
  factor(levels = c("WT", "Heter"))

Properties

k9dist <- k9dge %>%
  cpm(log = TRUE) %>%
  as.data.frame() %>%
  pivot_longer(everything(), names_to = "sample", values_to = "count") %>%
  ggplot(aes(x=count, colour = sample)) +
  geom_density() +
  labs(x = "intensity", title = "Distribution of 9-mers")
k9labels <- k9dge$samples %>% 
  mutate(label = paste0(sample, "\n", percent(rRNA, accuracy = 0.01), " rRNA")) %>% 
  .$label
k9heat <- k9dge %>% 
  cpm(log = TRUE) %>%
  as.data.frame() %>%
  t() %>%
  pheatmap(silent = TRUE, cluster_cols = FALSE,
           show_colnames = FALSE, fontsize = 9,
           fontsize_row = 10, border_color = NA,
           main = "9-mer counts heatmap", labels_row = k9labels)
k9heat$tree_row$labels <- k9labels
k9den <- ggdendrogram(k9heat$tree_row, rotate = TRUE) +
  labs(title = "Hierarchical clustering of 9-mer counts") +
  theme(plot.title = element_text(size = 12))
# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
k9pca <- k9dge %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(k9pca)$importance %>% pander(split.tables = Inf)
# Plot PCA
k9pcaPlot <- k9pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k9dge$samples) %>%
  ggplot(aes(PC1, PC2, colour = group, label = rRNA)) +
  geom_point(alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  labs(
    x = paste0("PC1 (", percent(summary(k9pca)$importance[2, "PC1"]), ")"),
    y = paste0("PC2 (", percent(summary(k9pca)$importance[2, "PC2"]), ")"),
    colour = "Genotype",
    title = "k = 9"
  ) +
  scale_colour_discrete(labels = c("Wildtype", "Mutant"))
k9pcaRrna <- k9pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k9dge$samples) %>%
  ggplot(aes(PC1, rRNA, label = rRNA)) +
  geom_point(aes(colour = group), alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  geom_smooth(method = "lm") +
  labs(
    x = paste0("PC1 (", percent(summary(k9pca)$importance[2, "PC1"]), ")"),
    y = "rRNA proportion",
    colour = "Genotype",
    title = "k = 9"
  ) +
  scale_y_continuous(labels = percent) +
  scale_colour_discrete(labels = c("Wildtype", "Mutant"))

Differential expression

k9design <- model.matrix(~rRNA, data = k9dge$samples)
k9voom <- voom(k9dge, design = k9design)
k9fit <- lmFit(k9voom, design = k9design)
k9eBayes <- eBayes(k9fit)
k9topTable <- k9eBayes %>%
  topTable(coef = colnames(k9design)[2], sort.by = "p", n = Inf) %>%
  set_colnames(str_remove(colnames(.), "ID\\.")) %>%
  rownames_to_column("mer") %>%
  mutate(BY = p.adjust(P.Value, "BY")) %>%
  mutate(DE = BY < 0.05) %>%
  dplyr::select(
    mer,
    AveExpr,
    logFC,
    P.Value,
    FDR = adj.P.Val,
    BY,
    t,
    DE,
    everything(),
    -B
  ) %>%
  as_tibble()

k = 10

Counts

k10files <- list.files("/hpcfs/users/a1647910/20200310_rRNADepletion/1_Psen2S4Ter/6_jellyfish2pass/k10", pattern = "_dumps.txt", full.names = TRUE)
k10counts <- lapply(k10files, function(x){
  read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
    set_colnames(str_remove_all(colnames(.), "_6month_F3|[0-9]*_Ps2Ex3M1_|_dumps\\.txt"))
}) %>%
  purrr::reduce(full_join) %>%
  dplyr::select(mer, contains(c("WT", "Heter")))
k10dge <- k10counts %>%
  as.data.frame() %>%
  column_to_rownames("mer") %>%
  DGEList() %>%
  calcNormFactors()
k10dge$samples %<>%
  rownames_to_column("rowname") %>%
  mutate(sample = rowname) %>%
  left_join(addInfo) %>%
  column_to_rownames("rowname")
k10dge$samples$group <- colnames(k10dge) %>%
  str_extract("(WT|Heter)") %>%
  factor(levels = c("WT", "Heter"))

Properties

k10dist <- k10dge %>%
  cpm(log = TRUE) %>%
  as.data.frame() %>%
  pivot_longer(everything(), names_to = "sample", values_to = "count") %>%
  ggplot(aes(x=count, colour = sample)) +
  geom_density() +
  labs(x = "intensity", title = "Distribution of 10-mers")
k10labels <- k10dge$samples %>% 
  mutate(label = paste0(sample, "\n", percent(rRNA, accuracy = 0.01), " rRNA")) %>% 
  .$label
k10heat <- k10dge %>% 
  cpm(log = TRUE) %>%
  as.data.frame() %>%
  t() %>%
  pheatmap(silent = TRUE, cluster_cols = FALSE,
           show_colnames = FALSE, fontsize = 9,
           fontsize_row = 10, border_color = NA,
           main = "10-mer counts heatmap", labels_row = k10labels)
k10heat$tree_row$labels <- k10labels
k10den <- ggdendrogram(k10heat$tree_row, rotate = TRUE) +
  labs(title = "Hierarchical clustering of 10-mer counts") +
  theme(plot.title = element_text(size = 12))
# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
k10pca <- k10dge %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(k10pca)$importance %>% pander(split.tables = Inf)
# Plot PCA
k10pcaPlot <- k10pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k10dge$samples) %>%
  ggplot(aes(PC1, PC2, colour = group, label = rRNA)) +
  geom_point(alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  labs(
    x = paste0("PC1 (", percent(summary(k10pca)$importance[2, "PC1"]), ")"),
    y = paste0("PC2 (", percent(summary(k10pca)$importance[2, "PC2"]), ")"),
    colour = "Genotype",
    title = "k = 10"
  ) +
  scale_colour_discrete(labels = c("Wildtype", "Mutant"))
k10pcaRrna <- k10pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k10dge$samples) %>%
  ggplot(aes(PC1, rRNA, label = rRNA)) +
  geom_point(aes(colour = group), alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  geom_smooth(method = "lm") +
  labs(
    x = paste0("PC1 (", percent(summary(k10pca)$importance[2, "PC1"]), ")"),
    y = "rRNA proportion",
    colour = "Genotype",
    title = "k = 10"
  ) +
  scale_y_continuous(labels = percent) +
  scale_colour_discrete(labels = c("Wildtype", "Mutant"))

Differential expression

k10design <- model.matrix(~rRNA, data = k10dge$samples)
k10voom <- voom(k10dge, design = k10design)
k10fit <- lmFit(k10voom, design = k10design)
k10eBayes <- eBayes(k10fit)
k10topTable <- k10eBayes %>%
  topTable(coef = colnames(k10design)[2], sort.by = "p", n = Inf) %>%
  set_colnames(str_remove(colnames(.), "ID\\.")) %>%
  rownames_to_column("mer") %>%
  mutate(BY = p.adjust(P.Value, "BY")) %>%
  mutate(DE = BY < 0.05) %>%
  dplyr::select(
    mer,
    AveExpr,
    logFC,
    P.Value,
    FDR = adj.P.Val,
    BY,
    t,
    DE,
    everything(),
    -B
  ) %>%
  as_tibble()

Distributions

ggarrange(
  k5dist, k6dist, k7dist, k8dist, k9dist, k10dist,
  ncol = 2, nrow = 3, common.legend = TRUE, legend = "bottom" 
)
*Distributions of kmer counts for each value of $k$*

Distributions of kmer counts for each value of \(k\)

PCA

ggarrange(
  k5pcaPlot, k6pcaPlot, k7pcaPlot, k8pcaPlot, k9pcaPlot, k10pcaPlot,
  ncol = 2, nrow = 3, common.legend = TRUE, legend = "bottom"
)
*Principal component analysis for all values of $k$. As $k$ increases, WT and mutant groups start to separate.*

Principal component analysis for all values of \(k\). As \(k\) increases, WT and mutant groups start to separate.

ggarrange(
  k5pcaRrna, k6pcaRrna, k7pcaRrna, k8pcaRrna, k9pcaRrna, k10pcaRrna,
  ncol = 2, nrow = 3, common.legend = TRUE, legend = "bottom"
)
*Principal component analysis for all values of $k$. As $k$ increases, WT and mutant groups start to separate.*

Principal component analysis for all values of \(k\). As \(k\) increases, WT and mutant groups start to separate.

Clustering

ggarrange(
  k5den, k6den, k7den, k8den, k9den, k10den,
  ncol = 2, nrow = 3, common.legend = TRUE, legend = "bottom"
)
*Hierarchical clustering of samples based on kmer counts. As $k$ increases, WT and mutant groups start to cluster.*

Hierarchical clustering of samples based on kmer counts. As \(k\) increases, WT and mutant groups start to cluster.

Differential expression

topRes <- function(x, cap){
  x %>% 
    dplyr::select(mer, AveExpr, logFC, P.Value, FDR, BY, DE) %>%
    mutate(
      AveExpr = format(round(AveExpr, 2), nsmall = 2),
      logFC = format(round(logFC, 2), nsmall = 2),
      P.Value = sprintf("%.2e", P.Value),
      FDR = sprintf("%.2e", FDR),
      BY = sprintf("%.2e", BY)
    ) %>%
    dplyr::slice(1:100) %>%
    datatable(
      options = list(pageLength = 10),
      class = "striped hover condensed responsive",
      filter = "top",
      caption = cap
    )
}

k = 5

topRes(k5topTable,
       cap = paste(
         "The top 100 differentially expressed 5-mers.",
         nrow(dplyr::filter(k5topTable, DE)),
         "of",
         nrow(k5topTable),
         "detected sequences were classified as DE with BY p-value < 0.05."
       )
)
k5topTable %>%
  ggplot(aes(P.Value)) +
  geom_histogram(
    binwidth = 0.02,
    colour = "black", fill = "grey90"
  ) +
  labs(title = "k = 5")
*Histogram of p-values. Values follow the expected distribution when there are many differences.*

Histogram of p-values. Values follow the expected distribution when there are many differences.

k5topTable %>%
  ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
  geom_point(alpha = 0.5) +
  scale_colour_manual(values = c("black", "grey50", "red")) +
  geom_text_repel(
    data = . %>%
      # dplyr::filter(-log10(P.Value) > 4 | logFC > 4 | logFC < -2),
      dplyr::filter(-log10(P.Value) > 4 | logFC > 3 | logFC < -2.5),
    aes(label = mer, color = "black")
  ) +
  labs(x = "logFC", y = expression(paste(-log[10], "(p)")), title = "k = 5") +
  theme_bw() +
  theme(legend.position = "none")
*Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.*

Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.


k = 6

topRes(k6topTable,
       cap = paste(
         "The top 100 differentially expressed 6-mers.",
         nrow(dplyr::filter(k6topTable, DE)),
         "of",
         nrow(k6topTable),
         "detected sequences were classified as DE with a BY p-value < 0.05."
       )
)
k6topTable %>%
  ggplot(aes(P.Value)) +
  geom_histogram(
    binwidth = 0.02,
    colour = "black", fill = "grey90"
  ) +
  labs(title = "k = 6")
*Histogram of p-values. Values follow the expected distribution when there are some differences.*

Histogram of p-values. Values follow the expected distribution when there are some differences.

k6topTable %>%
  ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
  geom_point(alpha = 0.5) +
  scale_colour_manual(values = c("black", "grey50", "red")) +
  geom_text_repel(
    data = . %>%
      dplyr::filter(-log10(P.Value) > 6 | logFC > 6 | logFC < -2.3),
    aes(label = mer, color = "black")
  ) +
  labs(x = "logFC", y = expression(paste(-log[10], "(p)")), title = "k = 6") +
  theme_bw() +
  theme(legend.position = "none")
*Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.*

Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.


k = 7

topRes(k7topTable,
       cap = paste(
         "The top 100 differentially expressed 7-mers.",
         nrow(dplyr::filter(k7topTable, DE)),
         "of",
         nrow(k7topTable),
         "detected sequences were classified as DE with a BY p-value < 0.05."
       )
)
k7topTable %>%
  ggplot(aes(P.Value)) +
  geom_histogram(
    binwidth = 0.02,
    colour = "black", fill = "grey90"
  ) +
  labs(title = "k = 7")
*Histogram of p-values. Values follow the expected distribution when there are many differences.*

Histogram of p-values. Values follow the expected distribution when there are many differences.

k7topTable %>%
  ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
  geom_point(alpha = 0.5) +
  scale_colour_manual(values = c("black", "grey50", "red")) +
  geom_text_repel(
    data = . %>%
      dplyr::filter(-log10(P.Value) > 6.4 | logFC > 10 | logFC < -5),
    aes(label = mer, color = "black")
  ) +
  labs(x = "logFC", y = expression(paste(-log[10], "(p)")), title = "k = 7") +
  theme_bw() +
  theme(legend.position = "none")
*Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.*

Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.


k = 8

topRes(k8topTable,
       cap = paste(
         "The top 100 differentially expressed 8-mers.",
         nrow(dplyr::filter(k8topTable, DE)),
         "of",
         nrow(k8topTable),
         "detected sequences were classified as DE with a BY p-value < 0.05."
       )
)
k8topTable %>%
  ggplot(aes(P.Value)) +
  geom_histogram(
    binwidth = 0.02,
    colour = "black", fill = "grey90"
  ) +
  labs(title = "k = 8")
*Histogram of p-values. Values follow the expected distribution when there are many differences.*

Histogram of p-values. Values follow the expected distribution when there are many differences.

k8topTable %>%
  ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
  geom_point(alpha = 0.5) +
  scale_colour_manual(values = c("black", "grey50", "red")) +
  geom_text_repel(
    data = . %>%
      dplyr::filter(-log10(P.Value) > 7.2 | logFC > 12.5 | logFC < -8),
    aes(label = mer, color = "black")
  ) +
  labs(x = "logFC", y = expression(paste(-log[10], "(p)")), title = "k = 8") +
  theme_bw() +
  theme(legend.position = "none")
*Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.*

Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.


k = 9

topRes(k9topTable,
       cap = paste(
         "The top 100 differentially expressed 9-mers.",
         nrow(dplyr::filter(k9topTable, DE)),
         "of",
         nrow(k9topTable),
         "detected sequences were classified as DE with a BY p-value < 0.05."
       )
)
k9topTable %>%
  ggplot(aes(P.Value)) +
  geom_histogram(
    binwidth = 0.02,
    colour = "black", fill = "grey90"
  ) +
  labs(title = "k = 9")
*Histogram of p-values. Values follow the expected distribution when there are many differences.*

Histogram of p-values. Values follow the expected distribution when there are many differences.

k9topTable %>%
  ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
  geom_point(alpha = 0.5) +
  scale_colour_manual(values = c("black", "grey50", "red")) +
  geom_text_repel(
    data = . %>%
      dplyr::filter(DE & -log10(P.Value) > 7.5 | logFC < -10 | logFC > 15),
    aes(label = mer, color = "black")
  ) +
  labs(x = "logFC", y = expression(paste(-log[10], "(p)")), title = "k = 9") +
  theme_bw() +
  theme(legend.position = "none")
*Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.*

Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.


k = 10

topRes(k10topTable,
       cap = paste(
         "The top 100 differentially expressed 10-mers.",
         nrow(dplyr::filter(k10topTable, DE)),
         "of",
         nrow(k10topTable),
         "detected sequences were classified as DE with a BY p-value < 0.05."
       )
)
k10topTable %>%
  ggplot(aes(P.Value)) +
  geom_histogram(
    binwidth = 0.02,
    colour = "black", fill = "grey90"
  ) +
  labs(title = "k = 10")
*Histogram of p-values. Values follow the expected distribution when there are many differences.*

Histogram of p-values. Values follow the expected distribution when there are many differences.

k10topTable %>%
  ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
  geom_point(alpha = 0.5) +
  scale_colour_manual(values = c("black", "grey50", "red")) +
  geom_text_repel(
    data = . %>%
      dplyr::filter(-log10(P.Value) > 8.5 | logFC > 20 | logFC < -13.5),
    aes(label = mer, color = "black")
  ) +
  labs(x = "logFC", y = expression(paste(-log[10], "(p)")), title = "k = 10") +
  theme_bw() +
  theme(legend.position = "none")
*Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.*

Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.

Session info

save(
  k5topTable,
  k6topTable,
  k7topTable,
  k8topTable,
  k9topTable,
  k10topTable,
  file = here::here(
    "1_Psen2S4Ter/R/output/1_2_kmer.RData"
  )
)
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.1 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_AU.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_AU.UTF-8        LC_COLLATE=en_AU.UTF-8    
##  [5] LC_MONETARY=en_AU.UTF-8    LC_MESSAGES=en_AU.UTF-8   
##  [7] LC_PAPER=en_AU.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] ggdendro_0.1.22  pheatmap_1.0.12  ggrepel_0.8.2    DT_0.14         
##  [5] edgeR_3.30.3     limma_3.44.3     kableExtra_1.1.0 ggpubr_0.4.0    
##  [9] scales_1.1.1     here_0.1         pander_0.6.3     magrittr_1.5    
## [13] forcats_0.5.0    stringr_1.4.0    dplyr_1.0.0      purrr_0.3.4     
## [17] readr_1.3.1      tidyr_1.1.0      tibble_3.0.3     ggplot2_3.3.2   
## [21] tidyverse_1.3.0 
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-149       fs_1.4.2           lubridate_1.7.9    webshot_0.5.2     
##  [5] RColorBrewer_1.1-2 httr_1.4.1         rprojroot_1.3-2    tools_4.0.3       
##  [9] backports_1.1.8    R6_2.4.1           mgcv_1.8-33        DBI_1.1.0         
## [13] colorspace_1.4-1   withr_2.2.0        tidyselect_1.1.0   gridExtra_2.3     
## [17] curl_4.3           compiler_4.0.3     cli_2.0.2          rvest_0.3.5       
## [21] xml2_1.3.2         labeling_0.3       digest_0.6.25      foreign_0.8-80    
## [25] rmarkdown_2.3      rio_0.5.16         pkgconfig_2.0.3    htmltools_0.5.0   
## [29] dbplyr_1.4.4       highr_0.8          htmlwidgets_1.5.1  rlang_0.4.7       
## [33] readxl_1.3.1       rstudioapi_0.11    generics_0.0.2     farver_2.0.3      
## [37] jsonlite_1.7.0     crosstalk_1.1.0.1  zip_2.0.4          car_3.0-8         
## [41] Matrix_1.2-18      Rcpp_1.0.5         munsell_0.5.0      fansi_0.4.1       
## [45] abind_1.4-5        lifecycle_0.2.0    stringi_1.4.6      yaml_2.2.1        
## [49] carData_3.0-4      MASS_7.3-53        grid_4.0.3         blob_1.2.1        
## [53] crayon_1.3.4       lattice_0.20-41    splines_4.0.3      haven_2.3.1       
## [57] cowplot_1.0.0      hms_0.5.3          locfit_1.5-9.4     knitr_1.29        
## [61] pillar_1.4.6       ggsignif_0.6.0     reprex_0.3.0       glue_1.4.1        
## [65] evaluate_0.14      data.table_1.12.8  modelr_0.1.8       vctrs_0.3.2       
## [69] cellranger_1.1.0   gtable_0.3.0       assertthat_0.2.1   xfun_0.15         
## [73] openxlsx_4.1.5     broom_0.7.0        rstatix_0.6.0      viridisLite_0.3.0 
## [77] ellipsis_0.3.1